library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
# Dataset COVID --> https://www.kaggle.com/datasets/sudalairajkumar/covid19-in-usa
" Context
Data is obtained from COVID-19 Tracking project and NYTimes. Sincere thanks to them for making it available to the public.
Coronaviruses are a large family of viruses which may cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). The most recently discovered coronavirus causes coronavirus disease COVID-19 - World Health Organization
The number of new cases are increasing day by day around the world. This dataset has information from 50 US states and the District of Columbia at daily level."
## [1] " Context\nData is obtained from COVID-19 Tracking project and NYTimes. Sincere thanks to them for making it available to the public.\n\nCoronaviruses are a large family of viruses which may cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). The most recently discovered coronavirus causes coronavirus disease COVID-19 - World Health Organization\n\nThe number of new cases are increasing day by day around the world. This dataset has information from 50 US states and the District of Columbia at daily level."
setwd("C:/Users/saksh/OneDrive/Desktop/R progamme/DA-Theory")
dataset <-read.csv("us_counties_covid19_daily.csv")
str(dataset)
## 'data.frame': 800437 obs. of 6 variables:
## $ date : chr "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" ...
## $ county: chr "Snohomish" "Snohomish" "Snohomish" "Cook" ...
## $ state : chr "Washington" "Washington" "Washington" "Illinois" ...
## $ fips : num 53061 53061 53061 17031 53061 ...
## $ cases : int 1 1 1 1 1 1 1 1 1 1 ...
## $ deaths: num 0 0 0 0 0 0 0 0 0 0 ...
View(dataset)
## Getting to know more about categorical data ##
length(unique(dataset$county))
## [1] 1929
length(unique(dataset$date))
## [1] 320
length(unique(dataset$state))
## [1] 55
for(i in names(dataset))
{
print(c(i,sum(is.na(dataset[[i]]))))
}
## [1] "date" "0"
## [1] "county" "0"
## [1] "state" "0"
## [1] "fips" "7591"
## [1] "cases" "0"
## [1] "deaths" "16733"
dataset <- na.omit(dataset)
nrow(dataset)
## [1] 776113
newdata <- group_by(dataset, state)
DeathsStates <- summarize(newdata,TotalDeaths = sum(deaths, na.rm=TRUE))
DeathsStates
## # A tibble: 53 x 2
## state TotalDeaths
## <chr> <dbl>
## 1 Alabama 413394
## 2 Alaska 8638
## 3 Arizona 845256
## 4 Arkansas 203412
## 5 California 2479209
## 6 Colorado 432318
## 7 Connecticut 974624
## 8 Delaware 123490
## 9 District of Columbia 127280
## 10 Florida 2101347
## # ... with 43 more rows
View(DeathsStates)
newdata <- group_by(dataset, state)
CasesStates <- summarize(newdata,TotalCases = sum(cases, na.rm=TRUE))
View(CasesStates)
CasesStates
## # A tibble: 53 x 2
## state TotalCases
## <chr> <int>
## 1 Alabama 24024159
## 2 Alaska 1732614
## 3 Arizona 35564131
## 4 Arkansas 12982442
## 5 California 127488265
## 6 Colorado 15541584
## 7 Connecticut 12976009
## 8 Delaware 3841367
## 9 District of Columbia 2965627
## 10 Florida 107097126
## # ... with 43 more rows
TotalDeathsandCasesStates <- merge(CasesStates,DeathsStates)
TotalDeathsandCasesStates
## state TotalCases TotalDeaths
## 1 Alabama 24024159 413394
## 2 Alaska 1732614 8638
## 3 Arizona 35564131 845256
## 4 Arkansas 12982442 203412
## 5 California 127488265 2479209
## 6 Colorado 15541584 432318
## 7 Connecticut 12976009 974624
## 8 Delaware 3841367 123490
## 9 District of Columbia 2965627 127280
## 10 Florida 107097126 2101347
## 11 Georgia 47219809 1102543
## 12 Hawaii 1609158 20057
## 13 Idaho 7285734 76386
## 14 Illinois 58791023 1759070
## 15 Indiana 24135371 732228
## 16 Iowa 16384096 246471
## 17 Kansas 10723922 125276
## 18 Kentucky 12581483 217679
## 19 Louisiana 27560697 973852
## 20 Maine 1070475 28245
## 21 Maryland 22726263 764945
## 22 Massachusetts 28741887 1917453
## 23 Michigan 29322103 1529416
## 24 Minnesota 19697867 399032
## 25 Mississippi 15549961 459146
## 26 Missouri 19493520 368377
## 27 Montana 3039448 36557
## 28 Nebraska 9094011 94108
## 29 Nevada 13109245 248426
## 30 New Hampshire 1808423 86192
## 31 New Jersey 46122131 3345525
## 32 New Mexico 6288061 162113
## 33 New York 48371010 2050253
## 34 North Carolina 34135408 574204
## 35 North Dakota 4365161 55013
## 36 Northern Mariana Islands 9994 290
## 37 Ohio 29227779 855717
## 38 Oklahoma 13916649 177146
## 39 Oregon 5741679 94902
## 40 Pennsylvania 33208997 1648592
## 41 Rhode Island 4686126 209317
## 42 South Carolina 22343348 491539
## 43 South Dakota 4792546 52542
## 44 Tennessee 31128640 387269
## 45 Texas 119483867 2261899
## 46 Utah 13588710 81563
## 47 Vermont 396976 13520
## 48 Virgin Islands 167365 2370
## 49 Virginia 24999703 548547
## 50 Washington 16153040 421336
## 51 West Virginia 2855598 55763
## 52 Wisconsin 25129253 289315
## 53 Wyoming 1527219 11733
View(TotalDeathsandCasesStates)
# Telling about total Deaths happened in each state
fig <- plot_ly(data = DeathsStates,type = "bar",x = ~state,y = ~TotalDeaths,marker = list(color = 'red'))
fig <- fig %>% layout(title = "State vs TotalDeaths")
fig
## This give side by side by side information regarding 2 important topics i.e Cases Tot and Death tot according to the state which will help to know out of the effected cases how many are dieing and how many are recovering
## As the Number of cases are far more than deaths hence dividing it by 20
newdata<-mutate(TotalDeathsandCasesStates,TotalCases = TotalCases/20)
### Grouped Bar chart ###
fig_bar <- plot_ly(data=newdata,type="bar",x=~state,y=~TotalCases,name="CasesTot")
fig_bar <- fig_bar%>% add_trace(y=~TotalDeaths,name="DeathsTot")
fig_bar <- fig_bar%>%layout(barmode='group',annotations = list(
list(text = "Number of cases is divided via 20")))
fig_bar
## This give side by side by side information regarding 2 important topics i.e Cases Tot and Death tot according to the state which will help to know out of the effected cases how many are dieing and how many are recovering
fig_bar <- plot_ly(data=newdata,type="bar",x=~state,y=~TotalCases,name="CasesTot")
fig_bar <- fig_bar%>% add_trace(y=~TotalDeaths,name="DeathsTot")
fig_bar <- fig_bar%>%layout(barmode='stack',annotations = list(text = "Number of cases is divided via 20"))
fig_bar
# This shows the trend between the TotalCases and TotalDeaths in each state
ggplot(data = TotalDeathsandCasesStates,
mapping = aes(x = TotalCases, y = TotalDeaths,size = TotalCases,color = TotalDeaths)) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# This tells us information regarding total deaths and total cases
ggplot(data = TotalDeathsandCasesStates,
mapping = aes(x = TotalCases, y = TotalDeaths,size = TotalCases,color = TotalDeaths)) +
geom_point(alpha = .7)
### Animating scatterplot
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.1.3
library(gifski)
## Warning: package 'gifski' was built under R version 4.1.3
### Animate --> Scatter Plot
scatter_plot_animate = ggplot(data=TotalDeathsandCasesStates, aes(TotalCases, TotalDeaths)) + geom_point() +
transition_states(state)
animate(scatter_plot_animate, renderer = gifski_renderer())
newdata <- group_by(dataset, date)
DeathsdateSum <- summarize(newdata,TotalDeaths = sum(deaths, na.rm=TRUE))
DeathsdateSum
## # A tibble: 320 x 2
## date TotalDeaths
## <chr> <dbl>
## 1 2020-01-21 0
## 2 2020-01-22 0
## 3 2020-01-23 0
## 4 2020-01-24 0
## 5 2020-01-25 0
## 6 2020-01-26 0
## 7 2020-01-27 0
## 8 2020-01-28 0
## 9 2020-01-29 0
## 10 2020-01-30 0
## # ... with 310 more rows
## This is a time series plot showing day to day increase/decrease in Total Deaths
fig <- plot_ly(DeathsdateSum, type = 'scatter', mode = 'lines')%>%
add_trace(x = ~date, y = ~TotalDeaths)%>%
layout(showlegend = F)
fig <- fig %>%
layout(
xaxis = list(zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
plot_bgcolor='#e5ecf6', width = 900)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
fig
## Warning: Can't display both discrete & non-discrete data on same axis
## Shows average density of deaths each day ##
ggplot(DeathsdateSum,
aes(x = TotalDeaths)) +
geom_density(alpha = 0.4,fill = "Cornflowerblue") +
labs(title = "Deaths distribution according to dates")
### Data containing total Cases on each date ###
newdata <- group_by(dataset, date)
CasesdateSum <- summarize(newdata,TotalCases = sum(cases, na.rm=TRUE))
CasesdateSum
## # A tibble: 320 x 2
## date TotalCases
## <chr> <int>
## 1 2020-01-21 1
## 2 2020-01-22 1
## 3 2020-01-23 1
## 4 2020-01-24 2
## 5 2020-01-25 3
## 6 2020-01-26 5
## 7 2020-01-27 5
## 8 2020-01-28 5
## 9 2020-01-29 5
## 10 2020-01-30 6
## # ... with 310 more rows
### Time Series ###
## This is a time series plot showing day to day increase/decrease in Total Cases
fig <- plot_ly(CasesdateSum, type = 'scatter', mode = 'lines')%>%
add_trace(x = ~date, y = ~TotalCases)%>%
layout(showlegend = F)
fig <- fig %>%
layout(
xaxis = list(zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
plot_bgcolor='#e5ecf6', width = 900)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
fig
## Warning: Can't display both discrete & non-discrete data on same axis
## Shows average density of deaths each day ##
ggplot(CasesdateSum,
aes(x = TotalCases)) +
geom_density(alpha = 0.4,fill = "Cornflowerblue") +
labs(title = "Cases each day frequency")
### This tells total number of Deaths from each state ###
### As there are many states hence reducing the numbers via considering states whose contribution is lesser than 2% as others
TotalDeathsandCasesStates
## state TotalCases TotalDeaths
## 1 Alabama 24024159 413394
## 2 Alaska 1732614 8638
## 3 Arizona 35564131 845256
## 4 Arkansas 12982442 203412
## 5 California 127488265 2479209
## 6 Colorado 15541584 432318
## 7 Connecticut 12976009 974624
## 8 Delaware 3841367 123490
## 9 District of Columbia 2965627 127280
## 10 Florida 107097126 2101347
## 11 Georgia 47219809 1102543
## 12 Hawaii 1609158 20057
## 13 Idaho 7285734 76386
## 14 Illinois 58791023 1759070
## 15 Indiana 24135371 732228
## 16 Iowa 16384096 246471
## 17 Kansas 10723922 125276
## 18 Kentucky 12581483 217679
## 19 Louisiana 27560697 973852
## 20 Maine 1070475 28245
## 21 Maryland 22726263 764945
## 22 Massachusetts 28741887 1917453
## 23 Michigan 29322103 1529416
## 24 Minnesota 19697867 399032
## 25 Mississippi 15549961 459146
## 26 Missouri 19493520 368377
## 27 Montana 3039448 36557
## 28 Nebraska 9094011 94108
## 29 Nevada 13109245 248426
## 30 New Hampshire 1808423 86192
## 31 New Jersey 46122131 3345525
## 32 New Mexico 6288061 162113
## 33 New York 48371010 2050253
## 34 North Carolina 34135408 574204
## 35 North Dakota 4365161 55013
## 36 Northern Mariana Islands 9994 290
## 37 Ohio 29227779 855717
## 38 Oklahoma 13916649 177146
## 39 Oregon 5741679 94902
## 40 Pennsylvania 33208997 1648592
## 41 Rhode Island 4686126 209317
## 42 South Carolina 22343348 491539
## 43 South Dakota 4792546 52542
## 44 Tennessee 31128640 387269
## 45 Texas 119483867 2261899
## 46 Utah 13588710 81563
## 47 Vermont 396976 13520
## 48 Virgin Islands 167365 2370
## 49 Virginia 24999703 548547
## 50 Washington 16153040 421336
## 51 West Virginia 2855598 55763
## 52 Wisconsin 25129253 289315
## 53 Wyoming 1527219 11733
## Data containing all the states having contribution of cases to the whole greater than 2% ##
newdata <- select(filter(TotalDeathsandCasesStates,TotalDeaths/sum(TotalDeaths) > 0.02),state,TotalDeaths)
## Data containing all the states having contribution of cases to the whole greater than 2% ##
sumOTHERdeaths = sum(filter(TotalDeathsandCasesStates,TotalDeaths/sum(TotalDeaths) <= 0.02)$TotalDeaths)
newdata[nrow(newdata) + 1,] <- c("Others",sumOTHERdeaths)
newdata
## state TotalDeaths
## 1 Arizona 845256
## 2 California 2479209
## 3 Connecticut 974624
## 4 Florida 2101347
## 5 Georgia 1102543
## 6 Illinois 1759070
## 7 Indiana 732228
## 8 Louisiana 973852
## 9 Maryland 764945
## 10 Massachusetts 1917453
## 11 Michigan 1529416
## 12 New Jersey 3345525
## 13 New York 2050253
## 14 Ohio 855717
## 15 Pennsylvania 1648592
## 16 Texas 2261899
## 17 Others 7342966
### Piechart ###
piechart <- plot_ly(newdata) %>%add_pie(newdata, labels = ~state, values = ~TotalDeaths,type = 'pie')
piechart
### This tells total number of Deaths from each state ###
DonutChart <- plot_ly(newdata) %>%add_pie(newdata, labels = ~state, values = ~TotalDeaths,type = 'pie',hole = 0.5)
DonutChart
### This tells total number of Cases from each state ###
### As there are many states hence reducing the numbers via considering states whose contribution is lesser than 2% as others
TotalDeathsandCasesStates
## state TotalCases TotalDeaths
## 1 Alabama 24024159 413394
## 2 Alaska 1732614 8638
## 3 Arizona 35564131 845256
## 4 Arkansas 12982442 203412
## 5 California 127488265 2479209
## 6 Colorado 15541584 432318
## 7 Connecticut 12976009 974624
## 8 Delaware 3841367 123490
## 9 District of Columbia 2965627 127280
## 10 Florida 107097126 2101347
## 11 Georgia 47219809 1102543
## 12 Hawaii 1609158 20057
## 13 Idaho 7285734 76386
## 14 Illinois 58791023 1759070
## 15 Indiana 24135371 732228
## 16 Iowa 16384096 246471
## 17 Kansas 10723922 125276
## 18 Kentucky 12581483 217679
## 19 Louisiana 27560697 973852
## 20 Maine 1070475 28245
## 21 Maryland 22726263 764945
## 22 Massachusetts 28741887 1917453
## 23 Michigan 29322103 1529416
## 24 Minnesota 19697867 399032
## 25 Mississippi 15549961 459146
## 26 Missouri 19493520 368377
## 27 Montana 3039448 36557
## 28 Nebraska 9094011 94108
## 29 Nevada 13109245 248426
## 30 New Hampshire 1808423 86192
## 31 New Jersey 46122131 3345525
## 32 New Mexico 6288061 162113
## 33 New York 48371010 2050253
## 34 North Carolina 34135408 574204
## 35 North Dakota 4365161 55013
## 36 Northern Mariana Islands 9994 290
## 37 Ohio 29227779 855717
## 38 Oklahoma 13916649 177146
## 39 Oregon 5741679 94902
## 40 Pennsylvania 33208997 1648592
## 41 Rhode Island 4686126 209317
## 42 South Carolina 22343348 491539
## 43 South Dakota 4792546 52542
## 44 Tennessee 31128640 387269
## 45 Texas 119483867 2261899
## 46 Utah 13588710 81563
## 47 Vermont 396976 13520
## 48 Virgin Islands 167365 2370
## 49 Virginia 24999703 548547
## 50 Washington 16153040 421336
## 51 West Virginia 2855598 55763
## 52 Wisconsin 25129253 289315
## 53 Wyoming 1527219 11733
## Data containing all the states having contribution of cases to the whole greater than 2% ##
newdata <- select(filter(TotalDeathsandCasesStates,TotalCases/sum(TotalCases) > 0.02),state,TotalCases)
## Data containing all the states having contribution of cases to the whole greater than 2% ##
sumOTHERCases = sum(filter(TotalDeathsandCasesStates,TotalDeaths/sum(TotalDeaths) <= 0.02)$TotalCases)
newdata[nrow(newdata) + 1,] <- c("Others",sumOTHERCases)
newdata
## state TotalCases
## 1 Arizona 35564131
## 2 California 127488265
## 3 Florida 107097126
## 4 Georgia 47219809
## 5 Illinois 58791023
## 6 Indiana 24135371
## 7 Louisiana 27560697
## 8 Massachusetts 28741887
## 9 Michigan 29322103
## 10 New Jersey 46122131
## 11 New York 48371010
## 12 North Carolina 34135408
## 13 Ohio 29227779
## 14 Pennsylvania 33208997
## 15 Tennessee 31128640
## 16 Texas 119483867
## 17 Virginia 24999703
## 18 Wisconsin 25129253
## 19 Others 404760612
### This tells total number of Cases from each state ###
piechart <- plot_ly(newdata) %>%add_pie(newdata, labels = ~state, values = ~TotalCases,type = 'pie')
piechart
DonutChart <- plot_ly(newdata) %>%add_pie(newdata, labels = ~state, values = ~TotalCases,type = 'pie',hole = 0.5)
DonutChart
## Tells regarding Places where need to focus more
ggplot(TotalDeathsandCasesStates, aes(x=TotalCases, y=TotalDeaths)) +
geom_boxplot(color ="blue")+
labs(x="TotalCases", y="TotalDeaths")+
ggtitle("box plot of Total Cases vs TotalDeaths ")
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
View(TotalDeathsandCasesStates)
## Tells regarding Places where need to focus more
ggplot(TotalDeathsandCasesStates, aes(x=TotalCases, y=TotalDeaths)) +
geom_boxplot(width=0.3,color ="yellow")+
geom_violin(width=2,alpha=0.5,color ="red")+
labs(x="Total Cases", y="Total Deaths")+
ggtitle("Total Cases vs Total Deaths")
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
ZomatoData <- read.csv("zomato.csv")
View(ZomatoData)